rm(list=ls())
library(readstata13)
library(modelsummary)
library(data.table)
library(splitstackshape)
library(dplyr)
library(fastDummies)
library(rdrobust)
library(lubridate) 

#Three waves are created. Each wave has data from two rounds - w01 (baseline and follow-up 2) ,  w23 (follow up 3 and 4), and w45 (follow up 5 and 6).
#For each pair, the blood pressure is measured in the earlier wave
#BP,the main independent variable is calculated using the average of the last two readings.
#For each pair, the outcome variables are measured in the later wave
#There are two outcome variables: hypertension diagnosis (hypdiag) and hypertension treatment (hypdrug)
#The structure of the final dataset is long. Each person id can have between 1 and 3 rows linked to it. The pid + wave variable uniquely identifies each row.

# working directory
  setwd("")

# input stata dataset
  raw <- read.dta13("")
  base<-as.data.table(raw)

###########################################################################################
# General preparations
###########################################################################################
  
# sort ids by year of interview
  datasummary(pid + hhp_id ~ N, data=base)
  base<-base[order(hhp_id,pid,entryd)]

# creating a variable with year of survey and months of survey
  fnames<- grep("entry", names(base), value=T)
  for (i in fnames) {
    temp <- tstrsplit(base[[i]], "-")
    set(base, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
  }
  # removing day variables
    base<-base[,c("entryd_3"  ,    "entryd_fup1_3",
                  "entryd_fup2_3", "entryd_fup3_3", "entryd_fup4_3" ,"entryd_fup5_3"):=NULL] 
    
    rm(fnames,temp,i,raw)
  
  # renaming year and month variables; saving as numeric
    base<-setnames(base,old=c("entryd_1", "entryd_2" ,   
                              "entryd_fup1_1" ,  "entryd_fup1_2",     
                              "entryd_fup2_1" ,  "entryd_fup2_2", 
                              "entryd_fup3_1" ,  "entryd_fup3_2", 
                              "entryd_fup4_1" ,  "entryd_fup4_2",      
                              "entryd_fup5_1" ,  "entryd_fup5_2" ), new=c("year_0", "month_0",
                                                                          "year_1", "month_1",
                                                                          "year_2", "month_2",
                                                                          "year_3", "month_3",
                                                                          "year_4", "month_4",
                                                                          "year_5", "month_5"))
    numer <- c("year_0", "month_0",
               "year_1", "month_1",
               "year_2", "month_2",
               "year_3", "month_3",
               "year_4", "month_4",
               "year_5", "month_5")

    base[, numer] <- base[, lapply(.SD, as.numeric), .SDcols = numer]
    rm(numer)

# creating a variable to indicate the if the respondent is present in the wave consecutive to a blood pressure reading
  base<-base[,w01:=ifelse(is.na(year_0)=="FALSE" & is.na(year_1)=="FALSE",1,0)] # baseline and follow up 1
  base<-base[,w23:=ifelse(is.na(year_2)=="FALSE" & is.na(year_3)=="FALSE",1,0)] # follow up 2 and follow up 3
  base<-base[,w45:=ifelse(is.na(year_4)=="FALSE" & is.na(year_5)=="FALSE",1,0)]  # follow up 4 and follow up 5


# female variable
  base<-base[,female:=ifelse(pd_sex=="Female",1,ifelse(pd_sex=="Male",0,NA))]

# religion variable  
  base<-base[, religion:= ifelse(pd_relig=="Hindu",0,ifelse(pd_relig=="Muslim",1,2))]
  base$religion <- factor(base$religion,labels =c("Hindu","Muslim","Others"))

# observations by waves
  base<-base[,w_all:=ifelse(w01==1 & w23==1 & w45==1,1,0)]
  datasummary(w01 + w23 +w45 + w_all~ N + mean, data=base)

###########################################################################################
# Cleaning data by waves - for all these waves, we keep only paired observation
###########################################################################################
# Creating an attrition tracking matrix
  samplesize<-matrix(nrow=9,ncol=4)
  samplesize[1,1] <-"Baseline"
  samplesize[2,1] <-"Age eligible"
  samplesize[3,1] <-"Paired and non-missing sex"
  samplesize[4,1] <-"Non-missing baseline blood pressure readings"
  samplesize[5,1] <-"Non-missing hypertension diagnosis variables"
  samplesize[6,1] <-"Meets inclusion criteria for hypertension diagnosis analysis"
  samplesize[7,1] <-"Non-missing hypertension treatment follow-up variables"
  samplesize[8,1] <-"Meets inclusion criteria for hypertension treatment analysis"
  colnames(samplesize) <- c("Observations", "Pair1", "Pair2", "Pair3")

# Step 1 - Raw count of observations
  samplesize[1,2] <-nrow(base[is.na(year_0)=="FALSE",])
  samplesize[1,3] <-nrow(base[is.na(year_2)=="FALSE",])
  samplesize[1,4] <-nrow(base[is.na(year_4)=="FALSE",])


################################################################################
# Cleaning Pair 01 
################################################################################
# Step 2 - Keep people 30 years and older at pair baseline
  base <- base[, age_calc:= trunc((pd_dob %--% entryd) / years(1))]
  samplesize[2,2] <-nrow(base[is.na(year_0)=="FALSE" & age_calc>=30,])
  
# Step 3 - Keep people who have paired observations, aged 30+ with non-missing sex
  w01 <- base[w01==1,]
  w01 <- w01[age_calc>=30,] 
  w01 <- na.omit(w01, cols=(c("female")))  
  samplesize[3,2] <- nrow(w01)
  
# Step 4 - Drop respondents with missing BP measurements at pair baseline
  w01 <- w01[is.na(systolic_bp_second)=="FALSE" & is.na(systolic_bp_first)=="FALSE",]
  w01 <- w01[is.na(diastolic_bp_second)=="FALSE" & is.na(diastolic_bp_first)=="FALSE",]
  samplesize[4,2] <- nrow(w01)

# time between baseline and follow-up
  w01 <- w01[, sdiff:= (entryd %--% entryd_fup1) / days(30)]
  w01$sdiff <- ifelse(w01$sdiff < 0, NA, w01$sdiff)

# age categories (<40 and 40+) 
  w01 <- w01[, age_calc:= trunc((pd_dob %--% entryd) / years(1))]
  summary(w01$age_calc)
  w01 <- w01[, agecat:= ifelse(age_calc<40,0, ifelse(age_calc>=40,1,NA))]

# level of education 
  # The labels are wrong. Primary school is actually "uncompleted primary school", 
  # High school refers to "secondary school completed". Secondary School / intermediary refers to High School. 
  # This can be verified by looking at the years of schooling in each of the categories.
  w01 <- w01[, edu:= ifelse(pd_edu_stat=="Illiterate" | pd_edu_stat=="Literate" | pd_edu_stat=="Primary school", 0,
                            ifelse(pd_edu_stat == "High school",1,
                                   ifelse(pd_edu_stat=="Secondary school/ Intermediary" | pd_edu_stat=="Graduate" | pd_edu_stat=="Professional degree/post graduate", 2, NA)))]
  table(w01$edu)
  
# prior diagnosis of cardiometabolic disease
  w01 <- w01[,diabetesknow:=ifelse(pd_diabetes=="Yes",1,ifelse(pd_diabetes=="No",0,NA))] # individual reports diabetes diagnosis at baseline
  w01 <- w01[,heartknow:=ifelse(pd_heart=="Yes",1,ifelse(pd_heart=="No",0,NA))] # individual reports heart attack at baseline
  w01 <- w01[,strokeknow:=ifelse(pd_stroke=="Yes",1,ifelse(pd_stroke=="No",0,NA))] # individual reports stroke at baseline
  w01 <- w01[,anyknow:=ifelse((diabetesknow==1 | heartknow==1 | strokeknow==1),1,ifelse((diabetesknow==0 & heartknow==0 & strokeknow==0),0,NA))] # individual reports any of the three CMDs at baseline
  
# BP measurements
  # based on the communication with the CARRS team, the average of the last two readings were used to inform respondents. 
  w01 <- w01[,sys:=ifelse(is.na(systolic_bp_third=="TRUE"),((systolic_bp_first+systolic_bp_second)/2),((systolic_bp_third+systolic_bp_second)/2))]
  w01 <- w01[,dias:=ifelse(is.na(diastolic_bp_third=="TRUE"),((diastolic_bp_first+diastolic_bp_second)/2),((diastolic_bp_third+diastolic_bp_second)/2))]
  
  #indicator on individual having high systolic or diastolic BP measurements
  w01 <- w01[,syshigh:=ifelse(sys>=140,1,0)]
  w01 <- w01[,diashigh:=ifelse(dias>=90,1,0)]
  w01 <- w01[,sysdiashigh:=ifelse(sys>=140 | dias>=90 ,1,0)]
  
  
## Inclusion criteria and outcomes
  #hypknow: individual knew that they had hypertension in the first (or any earlier) round of the paired wave.
  #hyptreatknow: individual took medication at pair baseline
  #hypdiag: individual was diagnosed in the year before the pair follow-up
  #hypdrug: individual started medicating in the year before the pair follow-up 1 

  # respondent reports prior diagnosis at pair baseline
    w01  <- w01[,hypknow:=ifelse(pd_hbp=="Yes",1,ifelse(pd_hbp=="No",0,NA))] 

  # respondent took medication at pair baseline
    w01 <- w01[is.na(hypknow)=="FALSE" & hbp_trt_allopdrug=="Yes", hyptreatknow:=1] 
    w01 <- w01[is.na(hypknow)=="FALSE" & is.na(hyptreatknow)=="TRUE", hyptreatknow:=0]



  # Outcome: hypertension diagnosis at pair follow-up
    w01 <- w01[,hypdiag:=ifelse((fo_hypert=="Yes"),1,ifelse(fo_hypert=="No",0,NA))]
  
  # Outcome: hypertension treatment at pair follow-up
    w01 <- w01[is.na(fo_hypert)=="FALSE" & fo_hypdrug=="Yes" & fo_hyppres=="Yes",hypdrug:=1]
    w01 <- w01[is.na(fo_hypert)=="FALSE" & is.na(hypdrug)=="TRUE" ,hypdrug:=0]

# keeping relevant variables 
  w01 <- w01 %>% dplyr:: select((grep("pd", names(w01))), "pid" ,"hhp_id","w01","sys","dias","hypknow","hyptreatknow", 
                               "hypdiag","hypdrug","syshigh", "sdiff", 
                               "diashigh","sysdiashigh","female","edu","religion","agecat","age_calc",
                               "diabetesknow" , "heartknow" , "strokeknow","anyknow","entryd","entryd_fup1","city")
  
  datasummary(sys +dias + hypdiag + hypknow + hypdrug + hyptreatknow + 
                diabetesknow + heartknow + strokeknow~ N + mean, data=w01[is.na(hypdiag)=="FALSE" & is.na(hypknow)=="FALSE" 
                                                                           & is.na(hypdrug)=="FALSE" &
                                                                            is.na(heartknow)=="FALSE" &
                                                                          is.na(strokeknow)=="FALSE" &
                                                                          is.na(diabetesknow)=="FALSE"])

##Since diagnosis and treatment have different inclusion criteria, creating separate datasets for each
  # Steps 5.1 and 5.2 - Keep individuals with information on diagnosis at pair baseline
  # these are the same individuals that also have information on treatment at pair baseline
    w01 <- w01[pd_hbp=="Yes" | pd_hbp=="No"] 

    
  # Step 5.1.2 - Keep individuals with information on diagnosis at pair follow-up
    w01_diag <- na.omit(w01, cols=(c("hypdiag"))) 
    samplesize[5,2] <- nrow(w01_diag)

  # Step 5.1.3 - Keep individuals who do not report prior diagnosis at pair baseline
    w01_diag <- w01_diag[hypknow==0] 
    samplesize[6,2] <- nrow(w01_diag)

    
  # Step 5.2.2 - Keep individuals with information on treatment at pair follow-up
    w01_treat <- na.omit(w01, cols=(c("hypdrug"))) 
    samplesize[7,2] <- nrow(w01_treat)

  # Step 5.2.3 - Keep individuals who do not report taking medication at pair baseline
    w01_treat <- w01[hyptreatknow==0]
    samplesize[8,2] <- nrow(w01_treat)


###########################################################################################
# Cleaning Pair 23
###########################################################################################
# Step 2 - Keep people 30 years and older at pair baseline
    base <- base[, age_calc:= trunc((pd_dob %--% entryd_fup2) / years(1))]
    samplesize[2,3] <-nrow(base[is.na(year_2)=="FALSE" & age_calc>=30,])
    
# Step 3 - Keep people who have paired observations, aged 30+ with non-missing sex
    w23 <- base[w23==1,]
    w23 <- w23[, age_calc:= trunc((pd_dob %--% entryd_fup2) / years(1))]
    w23 <- w23[age_calc>=30,] 
    w23 <-na.omit(w23, cols=(c("female")))  
    samplesize[3,3] <- nrow(w23)
    
# Step 4 - Drop respondents with missing BP measurements at pair baseline
  w23 <- w23[is.na(sbp_fu2_2)=="FALSE" & is.na(sbp_fu2_1)=="FALSE",]
  w23 <- w23[is.na(dbp_fu2_2)=="FALSE" & is.na(dbp_fu2_1)=="FALSE",]
  samplesize[4,3] <- nrow(w23)

# Time between baseline and follow-up
  w23 <- w23[, sdiff:= (entryd_fup2 %--% entryd_fup3) / days(30)]
  w23$sdiff <- ifelse(w23$sdiff < 0, NA, w23$sdiff)

# age categories (<40 and 40+) 
  w23 <- w23[, agecat:= ifelse(age_calc<40,0, ifelse(age_calc>=40,1,NA))]

# level of education
  # The labels are wrong. Primary school is actually "uncompleted primary school", 
  # High school refers to "secondary school completed". Secondary School / intermediary refers to High School. 
  # This can be verified by looking at the years of schooling in each of the categories.
  # We used education category from the first survey wave
  w23 <- w23[, edu:= ifelse(pd_edu_stat=="Illiterate" | pd_edu_stat=="Literate" | pd_edu_stat=="Primary school", 0,
                            ifelse(pd_edu_stat== "High school",1,
                                   ifelse(pd_edu_stat=="Secondary school/ Intermediary" | pd_edu_stat=="Graduate" | pd_edu_stat=="Professional degree/post graduate", 2, NA)))]
  table(w23$edu)

# prior diagnosis of cardiometabolic disease
  w23 <- w23[,diabetesknow:=ifelse(mh_diab=="Yes" | pd_diabetes=="Yes" ,1,ifelse(mh_diab=="No",0,NA))] # individual reports diabetes diagnosis at baseline
  w23 <- w23[,heartknow:=ifelse(mh_heart=="Yes" | pd_heart=="Yes",1,ifelse(mh_heart=="No",0,NA))] # individual reports heart attack at baseline
  w23 <- w23[,strokeknow:=ifelse(mh_stroke=="Yes" | pd_heart=="Yes",1,ifelse(mh_stroke=="No",0,NA))] # individual reports stroke at baseline
  w23 <- w23[,anyknow:=ifelse((diabetesknow==1 | heartknow==1 | strokeknow==1),1,ifelse((diabetesknow==0 & heartknow==0 & strokeknow==0),0,NA))] # individual reports any of the three CMDs at baseline
  
  
# BP measurements
  # based on the communication with the CARRS team, the average of the last two readings were used to inform respondents
  w23 <- w23[,sys:=ifelse(is.na(sbp_fu2_3=="TRUE"),((sbp_fu2_1+sbp_fu2_2)/2),((sbp_fu2_3+sbp_fu2_2)/2))]
  w23 <- w23[,dias:=ifelse(is.na(dbp_fu2_3=="TRUE"),((dbp_fu2_1+dbp_fu2_2)/2),((dbp_fu2_3+dbp_fu2_2)/2))]

#indicator on individual having high systolic or diastolic BP measurements
  w23 <- w23[,syshigh:=ifelse(sys>=140,1,0)]
  w23 <- w23[,diashigh:=ifelse(dias>=90,1,0)]
  w23 <- w23[,sysdiashigh:=ifelse(sys>=140 | dias>=90 ,1,0)]

## Inclusion criteria and outcomes
  #hypknow: individual knew that they had hypertension in the first (or any earlier) round of the paired wave.
  #hyptreatknow: individual took medication at pair baseline
  #hypdiag: individual was diagnosed in the year before the pair follow-up
  #hypdrug: individual started medicating in the year before the pair follow-up 1 
  
  #respondent reports prior diagnosis at pair baseline or any previous wave
    w23 <- w23[,hypknow:=ifelse((mh_hbp=="Yes"| pd_hbp=="Yes" | fo_hypert=="Yes"),1,
                              ifelse(is.na(mh_hbp)=="TRUE" & is.na(pd_hbp)=="TRUE" & is.na(fo_hypert)=="TRUE",NA,0))]
    w23 <- w23[mh_hbp=="Yes"| pd_hbp=="Yes" | fo_hypert=="Yes",hypknow:=1] 
    w23 <- w23[is.na(hypknow)=="TRUE",hypknow:=0] 
    w23 <- w23[is.na(mh_hbp)=="TRUE" & is.na(pd_hbp)=="TRUE" & is.na(fo_hypert)=="TRUE",hypknow:=NA]

  # respondent took medication at pair baseline
    w23 <- w23[hbp_allopathic=="Yes" , hyptreatknow:=1] #respondents who were taking bp medication in baseline of this pair/follow up 2 or wave 3 
    w23 <- w23[(is.na(hyptreatknow)=="TRUE" & is.na(hypknow)=="FALSE" ) , hyptreatknow:=0] #respondents who  were taking bp medication  in w01line 

  # Outcome: hypertension diagnosis at pair follow-up
    w23 <- w23[,hypdiag:=ifelse((f3_mh_hbp=="Yes" & f3_mh_hbp_howlong<13 & f3_mh_hbp_howlong>=0),1,ifelse(f3_mh_hbp=="No",0,NA))]
  
  # Outcome: hypertension treatment at pair follow-up
    w23 <- w23[is.na(f3_mh_hbp)=="FALSE" & f3_hbp_allopathic=="Yes" ,hypdrug:=1]
    w23 <- w23[is.na(f3_mh_hbp)=="FALSE" & is.na(hypdrug)=="TRUE" ,hypdrug:=0]

  # keeping relevant variables 
    w23 <- w23 %>% dplyr:: select((grep("pd", names(w23))),"pid" ,"hhp_id","w23","sys","dias","hypknow", "hyptreatknow","hypdiag",
                                 "sdiff", "hypdrug","syshigh","diashigh","sysdiashigh","female","edu","religion","agecat","age_calc",
                                 "diabetesknow" , "heartknow" , "strokeknow","anyknow","entryd_fup3","entryd_fup2","mh_hbp","city")

    datasummary(sys +dias + hypdiag + hypknow + hypdrug + hyptreatknow + 
                  diabetesknow + heartknow + strokeknow~ N + mean, data=w23[is.na(hypdiag)=="FALSE" & is.na(hypknow)=="FALSE" 
                                                                            & is.na(hypdrug)=="FALSE" &
                                                                              is.na(heartknow)=="FALSE" &
                                                                              is.na(strokeknow)=="FALSE" &
                                                                              is.na(diabetesknow)=="FALSE"])

##Since diagnosis and treatment have different inclusion criteria, creating separate datasets for each
  # Steps 5.1 and 5.2 - Keep individuals with information on diagnosis at pair baseline
  # these are the same individuals that also have information on treatment at pair baseline
    w23 <- w23[mh_hbp=="Yes" | mh_hbp=="No"] 

  # Step 5.1.2 - Keep individuals with information on diagnosis at pair follow-up
    w23_diag <- na.omit(w23, cols=(c("hypdiag"))) 
    samplesize[5,3] <- nrow(w23_diag)
    
  # Step 5.1.3 - Keep individuals who do not report prior diagnosis at pair baseline
    w23_diag <- w23_diag[hypknow==0] 
    samplesize[6,3] <- nrow(w23_diag)

    
  # Step 5.2.2 - Keep individuals with information on treatment at pair follow-up
    w23_treat <- na.omit(w23, cols=(c("hypdrug"))) 
    samplesize[7,3] <- nrow(w23_treat)

  # Step 5.2.3 - Keep individuals who do not report taking medication at pair baseline
    w23_treat <- w23[hyptreatknow==0] 
    samplesize[8,3] <- nrow(w23_treat)

################################################################################
# Cleaning Pair 45
################################################################################
# Step 2 - Keep people 30 years and older at pair baseline
  base <- base[, age_calc:= trunc((pd_dob %--% entryd_fup4) / years(1))]
  samplesize[2,4] <-nrow(base[is.na(year_4)=="FALSE" & age_calc>=30,])

# Step 3 - Keep people who have paired observations, aged 30+ with non-missing sex
  w45 <-base[w45==1,] 
  w45 <- w45[, age_calc:= trunc((pd_dob %--% entryd_fup4) / years(1))]
  w45 <- w45[age_calc>=30,] 
  w45 <-na.omit(w45, cols=(c("female")))  
  samplesize[3,4] <- nrow(w01)
  
# Step 4 - Drop respondents with missing BP measurements at pair baseline
  w45 <- w45[is.na(bp_s2)=="FALSE" & is.na(bp_s1)=="FALSE",]
  w45 <- w45[is.na(bp_d2)=="FALSE" & is.na(bp_d1)=="FALSE",]
  samplesize[4,4] <- nrow(w45)

# time between baseline and follow-up
  w45 <- w45[, sdiff:= (entryd_fup4 %--% entryd_fup5) / days(30)]
  w45$sdiff <- ifelse(w45$sdiff < 0, NA, w45$sdiff)

# age categories (<40 and 40+) 
  w45 <- w45[, age_calc:= trunc((pd_dob %--% entryd_fup4) / years(1))]
  summary(w45$age_calc)
  w45 <- w45[, agecat:= ifelse(age_calc<40,0, ifelse(age_calc>=40,1,NA))]

# level of education 
  w45 <- w45[, edu:= ifelse(edu_stat==5 | edu_stat==6 | edu_stat==7, 0,
                            ifelse(edu_stat == 4,1,
                                   ifelse(edu_stat==1 | edu_stat==2 | edu_stat==3, 2, NA)))]
  table(w45$edu)
  
# prior diagnosis of cardiometabolic disease
  # different to previous waves, individuals were asked about "ever". This is why we do not have to consider information
  # from the other waves. 
  w45 <- w45[,diabetesknow:=ifelse(dia_dis=="Yes",1,ifelse(dia_dis=="No",0,NA))] # individual reports diabetes diagnosis at baseline
  w45 <- w45[,heartknow:=ifelse(hrt_dis=="Yes",1,ifelse(hrt_dis=="No",0,NA))] # individual reports heart attack at baseline
  w45 <- w45[,strokeknow:=ifelse(stroke=="Yes",1,ifelse(stroke=="No",0,NA))] # individual reports stroke at baseline
  w45 <- w45[,anyknow:=ifelse((diabetesknow==1 | heartknow==1 | strokeknow==1),1,ifelse((diabetesknow==0 & heartknow==0 & strokeknow==0),0,NA))] # individual reports any of the three CMDs at baseline
  

# BP measurements
# based on the communication with the CARRS team, the average of the last two readings were used to inform respondents. 
  w45 <- w45[is.na(bp_s3)=="TRUE",sys:=(bp_s1+bp_s2)/2]
  w45 <- w45[is.na(sys)=="TRUE" & is.na(bp_s3)=="FALSE",sys:=((bp_s3+bp_s2)/2)]
  w45 <- w45[is.na(bp_d3)=="TRUE",dias:=(bp_d1+bp_d2)/2]
  w45 <- w45[is.na(dias)=="TRUE" & is.na(bp_d3)=="FALSE",dias:=((bp_d3+bp_d2)/2)]
  
#indicator on individual having high systolic or diastolic BP measurements
  w45 <- w45[,syshigh:=ifelse(sys>=140,1,0)]
  w45 <- w45[,diashigh:=ifelse(dias>=90,1,0)]
  w45 <- w45[,sysdiashigh:=ifelse(sys>=140 | dias>=90 ,1,0)]


## Inclusion criteria and outcomes
  #hypknow: individual knew that they had hypertension in the first (or any earlier) round of the paired wave.
  #hyptreatknow: individual took medication at pair baseline
  #hypdiag: individual was diagnosed in the year before the pair follow-up
  #hypdrug: individual started medicating in the year before the pair follow-up 1 
  
  #respondent reports prior diagnosis at pair baseline
    w45 <- w45[mh_hbp=="Yes"| pd_hbp=="Yes" | fo_hypert=="Yes" | hbp_dis=="Yes",hypknow:=1] 
    w45 <- w45[is.na(hypknow)=="TRUE",hypknow:=0] 
    w45 <- w45[is.na(mh_hbp)=="TRUE" & is.na(pd_hbp)=="TRUE" & is.na(fo_hypert)=="TRUE" & is.na(hbp_dis=="Yes")=="TRUE",hypknow:=NA]

  # respondent took medication at pair baseline
    w45 <- w45[hbp_trt_all=="Yes", hyptreatknow:=1] #respondents who were taking bp medication in the baseline of this pair
    w45 <- w45[(is.na(hyptreatknow)=="TRUE" & is.na(hypknow)=="FALSE" ) , hyptreatknow:=0] #respondents who were taking bp medication in the baseline of this pair


  # Outcome: hypertension diagnosis at pair follow-up
    w45 <- w45[(f5_hbp=="Yes" | f5_hbp=="No")  & 
            ((f5_hbp_yr==1 & f5_hbp_mon==0) | (f5_hbp_yr==0 & f5_hbp_mon<13 & f5_hbp_mon>=0)),hypdiag:=1]
    w45 <- w45[(f5_hbp=="Yes" | f5_hbp=="No") & is.na(hypdiag)=="TRUE",hypdiag:=0]

  # Outcome: hypertension treatment at pair follow-up
    w45 <- w45[(f5_hbp=="Yes" | f5_hbp=="No") & f5_hbp_allo=="Yes" &
            ((f5_hbp_yr==1 & f5_hbp_mon==0) | (f5_hbp_yr==0 & f5_hbp_mon<13 & f5_hbp_mon>=0)), hypdrug:=1]
    w45 <- w45[((f5_hbp=="Yes" | f5_hbp=="No") & is.na(hypdrug)=="TRUE"),hypdrug:=0]


# keeping relevant variables 
  w45 <- w45 %>% dplyr:: select((grep("pd", names(w45))), (grep("own", names(w45))), (grep("fuel", names(w45))),
                           (grep("emp", names(w45))), (grep("drink", names(w45))), (grep("toilet", names(w45))),
                           "sdiff", "pid" ,"hhp_id","w45","sys","dias","hypknow", "hyptreatknow", "hypdiag","hypdrug","syshigh","diashigh",
                           "sysdiashigh","edu_stat","female","edu","religion","agecat","age_calc",
                           "diabetesknow" , "heartknow" , "strokeknow","anyknow","f5_hbp_yr","f5_hbp_mon",
                           "entryd_fup4","entryd_fup5","year_5","month_5","hbp_dis","city")

  datasummary(sys +dias + hypdiag + hypknow + hypdrug + hyptreatknow + 
                diabetesknow + heartknow + strokeknow~ N + mean, data=w45[is.na(hypdiag)=="FALSE" & is.na(hypknow)=="FALSE" 
                                                                          & is.na(hypdrug)=="FALSE" &
                                                                            is.na(heartknow)=="FALSE" &
                                                                            is.na(strokeknow)=="FALSE" &
                                                                            is.na(diabetesknow)=="FALSE"])

##Since diagnosis and treatment have different inclusion criteria, creating separate datasets for each
  # Steps 5.1 and 5.2 - Keep individuals with information on diagnosis at pair baseline
  # these are the same individuals that also have information on treatment at pair baseline
    w45 <- w45[hbp_dis=="Yes" |hbp_dis=="No", ] 
  
  # Step 5.1.2 - Keep individuals with information on diagnosis at pair follow-up
    w45_diag <- na.omit(w45, cols=(c("hypdiag"))) 
    samplesize[5,4] <- nrow(w45_diag)
  
  # Step 5.1.3 - Keep individuals who do not report prior diagnosis at pair baseline
    w45_diag <- w45_diag[hypknow==0] 
    samplesize[6,4] <- nrow(w45_diag)
  
  # Step 5.2.2 - Keep individuals with information on treatment at pair follow-up
    w45_treat <- na.omit(w45, cols=(c("hypdrug"))) 
    samplesize[7,4] <- nrow(w45_treat)

  # Step 5.2.3 - Keep individuals who do not report taking medication at pair baseline
    w45_treat <- w45[hyptreatknow==0] 
    samplesize[8,4] <- nrow(w45_treat)


###########################################################################################
# Creating the analysis datasets
###########################################################################################

#identifying the different rounds
  cols = c("w01", "w23", "w45")

# dataset for diagnosis outcome
  carrs_selected <- rbindlist(list(w01_diag,w23_diag,w45_diag),fill = TRUE)
  carrs_selected[ , (cols) := lapply(.SD, nafill, fill=0), .SDcols = cols]  

# dataset for treatment outcome
  carrs_full <- rbindlist(list(w01_treat,w23_treat,w45_treat),fill = TRUE) 
  carrs_full[ , (cols) := lapply(.SD, nafill, fill=0), .SDcols = cols]  

#ordering the observations
  setcolorder(carrs_full,c("hhp_id" , "pid" ,  "sys" , "dias" , "hypknow" , "hypdiag" , "hypdrug", "w01" , "w23" , "w45","edu","city"))
  setcolorder(carrs_selected,c("hhp_id" , "pid" ,  "sys" , "dias" , "hypknow" , "hypdiag" , "hypdrug", "w01" , "w23" , "w45","edu","city"))

#  defining the following columns as factor for the summary characteristic tables
  cols <- c("diabetesknow","heartknow", "strokeknow", "religion")
  setDT(carrs_full )[, (cols):= lapply(.SD, factor), .SDcols=cols]
  setDT(carrs_selected )[, (cols):= lapply(.SD, factor), .SDcols=cols]

#creating a factor version of the education and age category variables and labeling them for the summary characteristic tables
  carrs_full$eduf <- factor(carrs_full$edu,labels =c("Primary","Secondary","Tertiary"))
  carrs_full$agecatf <- factor(carrs_full$agecat,labels=c("$<$40 years","$\\geq$40 years"))
  
  carrs_selected$eduf <- factor(carrs_selected$edu,labels =c("Primary","Secondary","Tertiary"))
  carrs_selected$agecatf <- factor(carrs_selected$agecat,labels=c("$<$40 years","$\\geq$40 years"))

###########################################################################################
# Running variables for diagnosis outcome
###########################################################################################

# main analysis
  carrs_selected<-carrs_selected[,runningvar:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                     ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==1,runningvar_female:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                     ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs_selected<-carrs_selected[female==0,runningvar_male:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                   ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
# by education group
  carrs_selected<-carrs_selected[edu == 0,runningvar_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                  ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[edu == 1,runningvar_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                  ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[edu == 2,runningvar_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                  ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs_selected<-carrs_selected[female==1 & edu == 0,runningvar_female_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                     ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==1 & edu == 1,runningvar_female_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                     ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==1 & edu == 2,runningvar_female_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                     ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs_selected<-carrs_selected[female==0 & edu == 0,runningvar_male_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                   ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==0 & edu == 1,runningvar_male_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                   ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==0 & edu == 2,runningvar_male_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                   ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
# by age group
  carrs_selected<-carrs_selected[agecat == 0,runningvar_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                     ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[agecat == 1,runningvar_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                     ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==1 & agecat == 0,runningvar_female_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                        ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==1 & agecat == 1,runningvar_female_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                        ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==0 & agecat == 0,runningvar_male_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                      ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_selected<-carrs_selected[female==0 & agecat == 1,runningvar_male_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                      ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
###########################################################################################
# Running variables for treatment outcome
###########################################################################################

# main analysis
  carrs_full<-carrs_full[,runningvar:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                          ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
                                                                 
  carrs_full<-carrs_full[female==1,runningvar_female:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                   ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs_full<-carrs_full[female==0,runningvar_male:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                     ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]


# by education group
  carrs_full<-carrs_full[edu == 0,runningvar_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                          ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[edu == 1,runningvar_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                          ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[edu == 2,runningvar_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                          ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs_full<-carrs_full[female==1 & edu == 0,runningvar_female_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[female==1 & edu == 1,runningvar_female_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[female==1 & edu == 2,runningvar_female_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs_full<-carrs_full[female==0 & edu == 0,runningvar_male_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                           ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[female==0 & edu == 1,runningvar_male_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                           ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[female==0 & edu == 2,runningvar_male_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                         ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]

# by age group
  carrs_full<-carrs_full[agecat == 0,runningvar_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[agecat == 1,runningvar_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[female==1 & agecat == 0,runningvar_female_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[female==1 & agecat == 1,runningvar_female_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[female==0 & agecat == 0,runningvar_male_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                              ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs_full<-carrs_full[female==0 & agecat == 1,runningvar_male_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                              ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]



###########################################################################################
# Summaries of dataset
###########################################################################################
# Diagnosis dataset
  datasummary(w01 + w23 + w45 ~ N , data=carrs_selected)
  datasummary(sys +dias + hypdiag + hypknow + hypdrug ~ N + mean, data=carrs_selected[is.na(hypdiag)=="FALSE" & is.na(hypknow)=="FALSE" 
                                                                                   & is.na(hypdrug)=="FALSE"])
  datasummary(sys +dias + hypdiag + hypknow + hypdrug ~ N , data=carrs_selected)
  

# Treatment dataset
  datasummary(w01 + w23 + w45 ~ N , data=carrs_full)
  datasummary(sys +dias + hypdiag + hypknow + hypdrug ~ N + mean, data=carrs_full[is.na(hypdiag)=="FALSE" & is.na(hypknow)=="FALSE" 
                                                                                  & is.na(hypdrug)=="FALSE"])
  datasummary(sys +dias + hypdiag + hypknow + hypdrug ~ N , data=carrs_full)

  samplesize<-as.data.table(samplesize)
  samplesize[, 2:ncol(samplesize)] <- lapply(2:ncol(samplesize), 
                                             function(x) as.numeric(samplesize[[x]]))
  samplesize<-samplesize %>% mutate(All=Pair1+Pair2+Pair3)


###########################################################################################
# Saving the datasets
###########################################################################################

write.csv(carrs_full, "carrs_full.csv", row.names=FALSE)
save(carrs_full, file = "carrs_full.Rdata")

write.csv(carrs_selected, "carrs_selected.csv", row.names=FALSE)
save(carrs_selected, file = "carrs_selected.Rdata")

write.csv(samplesize, "samplesize.csv", row.names=FALSE)
save(samplesize, file = "samplesize.Rdata")